import plotly.offline as pyo
# Initialise notebook mode so Plotly figures render offline
pyo.init_notebook_mode()
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import geopandas as gpd
import pprint
import numpy as np
#plt.rcParams['figure.figsize'] = [20, 15]
# Load the detailed listings scrape. The three date columns are parsed up
# front so they arrive as datetime64 rather than plain strings.
df_listings_detailed = pd.read_csv(
    "./data_airbnb_syd/20211106/listings_detailed.csv",
    parse_dates=['last_scraped', 'host_since', 'calendar_last_scraped'],
    header='infer',
)
# Preview the first two records.
df_listings_detailed.head(2)
| id | listing_url | scrape_id | last_scraped | name | description | neighborhood_overview | picture_url | host_id | host_url | ... | review_scores_communication | review_scores_location | review_scores_value | license | instant_bookable | calculated_host_listings_count | calculated_host_listings_count_entire_homes | calculated_host_listings_count_private_rooms | calculated_host_listings_count_shared_rooms | reviews_per_month | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 11156 | https://www.airbnb.com/rooms/11156 | 20211106153124 | 2021-11-07 | An Oasis in the City | Very central to the city which can be reached ... | It is very close to everything and everywhere,... | https://a0.muscache.com/pictures/2797669/17895... | 40855 | https://www.airbnb.com/users/show/40855 | ... | 4.77 | 4.82 | 4.71 | NaN | f | 1 | 0 | 1 | 0 | 2.02 |
| 1 | 14250 | https://www.airbnb.com/rooms/14250 | 20211106153124 | 2021-11-07 | Manly Harbour House | Beautifully renovated, spacious and quiet, our... | Balgowlah Heights is one of the most prestigio... | https://a0.muscache.com/pictures/56935671/fdb8... | 55948 | https://www.airbnb.com/users/show/55948 | ... | 4.33 | 4.67 | 4.33 | Exempt | f | 2 | 2 | 0 | 0 | 0.04 |
2 rows × 74 columns
# Overall size of the dataset as (rows, columns).
pprint.pprint(df_listings_detailed.shape)
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in pprint would print a stray "None".
df_listings_detailed.info()
(20513, 74) <class 'pandas.core.frame.DataFrame'> RangeIndex: 20513 entries, 0 to 20512 Data columns (total 74 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 20513 non-null int64 1 listing_url 20513 non-null object 2 scrape_id 20513 non-null int64 3 last_scraped 20513 non-null datetime64[ns] 4 name 20506 non-null object 5 description 19741 non-null object 6 neighborhood_overview 12365 non-null object 7 picture_url 20513 non-null object 8 host_id 20513 non-null int64 9 host_url 20513 non-null object 10 host_name 20508 non-null object 11 host_since 20508 non-null datetime64[ns] 12 host_location 20493 non-null object 13 host_about 10978 non-null object 14 host_response_time 7217 non-null object 15 host_response_rate 7217 non-null object 16 host_acceptance_rate 8565 non-null object 17 host_is_superhost 20508 non-null object 18 host_thumbnail_url 20508 non-null object 19 host_picture_url 20508 non-null object 20 host_neighbourhood 11738 non-null object 21 host_listings_count 20508 non-null float64 22 host_total_listings_count 20508 non-null float64 23 host_verifications 20513 non-null object 24 host_has_profile_pic 20508 non-null object 25 host_identity_verified 20508 non-null object 26 neighbourhood 12367 non-null object 27 neighbourhood_cleansed 20513 non-null object 28 neighbourhood_group_cleansed 0 non-null float64 29 latitude 20513 non-null float64 30 longitude 20513 non-null float64 31 property_type 20513 non-null object 32 room_type 20513 non-null object 33 accommodates 20513 non-null int64 34 bathrooms 0 non-null float64 35 bathrooms_text 20489 non-null object 36 bedrooms 19078 non-null float64 37 beds 19646 non-null float64 38 amenities 20513 non-null object 39 price 20513 non-null object 40 minimum_nights 20513 non-null int64 41 maximum_nights 20513 non-null int64 42 minimum_minimum_nights 20513 non-null int64 43 maximum_minimum_nights 20513 non-null int64 44 minimum_maximum_nights 20513 non-null int64 45 maximum_maximum_nights 
20513 non-null int64 46 minimum_nights_avg_ntm 20513 non-null float64 47 maximum_nights_avg_ntm 20513 non-null float64 48 calendar_updated 0 non-null float64 49 has_availability 20513 non-null object 50 availability_30 20513 non-null int64 51 availability_60 20513 non-null int64 52 availability_90 20513 non-null int64 53 availability_365 20513 non-null int64 54 calendar_last_scraped 20513 non-null datetime64[ns] 55 number_of_reviews 20513 non-null int64 56 number_of_reviews_ltm 20513 non-null int64 57 number_of_reviews_l30d 20513 non-null int64 58 first_review 14878 non-null object 59 last_review 14878 non-null object 60 review_scores_rating 14878 non-null float64 61 review_scores_accuracy 14260 non-null float64 62 review_scores_cleanliness 14270 non-null float64 63 review_scores_checkin 14253 non-null float64 64 review_scores_communication 14270 non-null float64 65 review_scores_location 14254 non-null float64 66 review_scores_value 14249 non-null float64 67 license 6761 non-null object 68 instant_bookable 20513 non-null object 69 calculated_host_listings_count 20513 non-null int64 70 calculated_host_listings_count_entire_homes 20513 non-null int64 71 calculated_host_listings_count_private_rooms 20513 non-null int64 72 calculated_host_listings_count_shared_rooms 20513 non-null int64 73 reviews_per_month 14878 non-null float64 dtypes: datetime64[ns](3), float64(19), int64(21), object(31) memory usage: 11.6+ MB None
# Print every column of the first listing as "<position> <column> <value>" so
# the full record can be read without column truncation.
# Series.items() replaces iteritems(), which was removed in pandas 2.0.
for i, (col, val) in enumerate(df_listings_detailed.iloc[0].items()):
    print(i, '\t', col, '\t', val)
0 id 11156 1 listing_url https://www.airbnb.com/rooms/11156 2 scrape_id 20211106153124 3 last_scraped 2021-11-07 00:00:00 4 name An Oasis in the City 5 description Very central to the city which can be reached by an easy walk or by bus, with transport at the door, if required, and all amenities within easy reach.<br /><br /><b>The space</b><br />Potts Pt. is a vibrant and popular inner-city suburb & the area was described as 'Australia's Bohemian Heart' in the article by Raymond Bonner in the New York Times . Ultra-convenient & within walking distance of the central business district & all tourist sites such as: <br />the Botanic Gardens, <br />Opera House, <br />NSW Art Gallery, <br />Australian Museum, <br />Circular Quay, the Sydney Opera House, and BridgeClimb<br />Darling Harbour<br />Museum of Contemporary Art<br />Art Gallery of NSW<br />Sydney Museum<br />Hyde Park<br />Botanic Gardens<br />Chinatown, Paddy's Markets, and the Chinese Gardens<br />Paddington (& the Saturday markets)<br />The historic Rocks area (& the markets on Saturday & Sunday)<br /><br />It is 7 minutes walk to the Kings Cross.train (Metro) station & buses to the be 6 neighborhood_overview It is very close to everything and everywhere, has many trees along the streets and is in a beautiful part of the city 7 picture_url https://a0.muscache.com/pictures/2797669/17895d03_original.jpg 8 host_id 40855 9 host_url https://www.airbnb.com/users/show/40855 10 host_name Colleen 11 host_since 2009-09-23 00:00:00 12 host_location Potts Point, New South Wales, Australia 13 host_about Recently retired, I've lived & worked on 4 continents & over the years have travelled thru about 50 countries. I live in inner-Sydney within walking distance of most places (except the beaches, which are approx. 30 minutes. away on public transport). Interests are travelling, wildlife, reading,snorkelling, theatre, ballet & interesting food. 
14 host_response_time nan 15 host_response_rate nan 16 host_acceptance_rate nan 17 host_is_superhost f 18 host_thumbnail_url https://a0.muscache.com/im/users/40855/profile_pic/1259121939/original.jpg?aki_policy=profile_small 19 host_picture_url https://a0.muscache.com/im/users/40855/profile_pic/1259121939/original.jpg?aki_policy=profile_x_medium 20 host_neighbourhood Potts Point 21 host_listings_count 1.0 22 host_total_listings_count 1.0 23 host_verifications ['email', 'phone', 'reviews'] 24 host_has_profile_pic t 25 host_identity_verified f 26 neighbourhood Potts Point, New South Wales, Australia 27 neighbourhood_cleansed Sydney 28 neighbourhood_group_cleansed nan 29 latitude -33.86767 30 longitude 151.22497 31 property_type Private room in rental unit 32 room_type Private room 33 accommodates 1 34 bathrooms nan 35 bathrooms_text 1 shared bath 36 bedrooms 1.0 37 beds nan 38 amenities ["TV", "Kitchen", "Refrigerator", "Extra pillows and blankets", "Cooking basics", "Stove", "Washer", "Dishwasher", "Hot water", "Dishes and silverware", "Essentials", "Backyard", "Oven", "Long term stays allowed", "Elevator", "Smoke alarm", "Single level home", "Hangers", "Shower gel", "Fire extinguisher", "Bed linens", "Wifi", "Microwave", "Hair dryer", "Shampoo", "Heating", "Iron", "Patio or balcony"] 39 price $65.00 40 minimum_nights 90 41 maximum_nights 180 42 minimum_minimum_nights 90 43 maximum_minimum_nights 90 44 minimum_maximum_nights 180 45 maximum_maximum_nights 180 46 minimum_nights_avg_ntm 90.0 47 maximum_nights_avg_ntm 180.0 48 calendar_updated nan 49 has_availability t 50 availability_30 29 51 availability_60 59 52 availability_90 89 53 availability_365 364 54 calendar_last_scraped 2021-11-07 00:00:00 55 number_of_reviews 196 56 number_of_reviews_ltm 0 57 number_of_reviews_l30d 0 58 first_review 2013-11-12 59 last_review 2020-03-05 60 review_scores_rating 4.6 61 review_scores_accuracy 4.76 62 review_scores_cleanliness 4.31 63 review_scores_checkin 4.8 64 
review_scores_communication 4.77 65 review_scores_location 4.82 66 review_scores_value 4.71 67 license nan 68 instant_bookable f 69 calculated_host_listings_count 1 70 calculated_host_listings_count_entire_homes 0 71 calculated_host_listings_count_private_rooms 1 72 calculated_host_listings_count_shared_rooms 0 73 reviews_per_month 2.02
# Number of listings captured on each scrape date.
scrape_dates = df_listings_detailed['last_scraped']
scrape_dates.value_counts()
2021-11-07 12543 2021-11-06 7970 Name: last_scraped, dtype: int64
# Distinct listing ids and distinct host ids present in the dataset.
property_ids = df_listings_detailed['id'].unique()
host_ids = df_listings_detailed['host_id'].unique()
print('Number of properties:', len(property_ids))
print('Number of hosts:', len(host_ids))
Number of properties: 20513 Number of hosts: 14977
# Number of distinct listings owned by each host (index: host_id).
listings_per_host = df_listings_detailed.groupby('host_id')['id'].nunique()

# Summary statistics are taken from the per-host series. The frequency table
# built below has one row per *distinct* listing count, so averaging its first
# column would give the mean of the distinct counts rather than the mean
# number of listings per host.
print('The least number of properties listed by one host:', listings_per_host.min())
print('The largets number of proeprties list by one host:', listings_per_host.max())
print('In average, one host lists properties:', listings_per_host.mean())
print('Median number of properties listed by one host:', listings_per_host.median())

# Frequency table for the chart: for each listing count, how many hosts have it.
host_property_number = listings_per_host.value_counts().reset_index()
host_property_number.columns = ["host_listing_number", "host_number"]

# Log-scaled y axis: single-listing hosts would otherwise dwarf every other bar.
fig = px.bar(
    host_property_number,
    x='host_listing_number',
    y='host_number',
    labels={
        'host_listing_number': 'Number of Properties Listed by Same Host',
        'host_number': 'Number of Hosts (log)',
    },
    log_y=True,
    title="NSW AirBnb Hosts' Listing Numbers ",
)
fig.show()
The least number of properties listed by one host: 1 The largets number of proeprties list by one host: 188 In average, one host lists properties: 33.07142857142857 Median number of properties listed by one host: 22.5
# Frequency table of property types: one row per type with its listing count.
df_property_types = df_listings_detailed['property_type'].value_counts().reset_index()
df_property_types.columns = ['property_type', 'number']

property_type_count = df_property_types.shape[0]
print("In the dateset, {} types of accommodations are listed.".format(property_type_count))
print("The largest number of accommodations are common types such as unit, room in unit, residential home.")
print("However, there are small numbers of unusual accommodation types e.g. earth house, tent.")

# Pie chart of the share of each property type. Slice text is switched off
# (textinfo='none'); details are available on hover instead.
fig = px.pie(
    df_property_types,
    values='number',
    names='property_type',
    title="AirBnb Property Types",
)
fig.update_traces(hoverinfo='label+percent+name', textinfo='none')
fig.update_layout(width=800, height=800)
fig.show()
# Frequency table of room types: one row per type with its listing count.
df_room_types = df_listings_detailed['room_type'].value_counts().reset_index()
df_room_types.columns = ['room_type', 'number']

room_type_count = df_room_types.shape[0]
print("Room types are rather limited. There are {} types of rooms listed.".format(room_type_count))
print("Majority of listings are made up of entire home/apt and private room.")

# Pie chart of the share of each room type.
fig = px.pie(
    df_room_types,
    values='number',
    names='room_type',
    title="AirBnb Room Types",
)
fig.update_traces(hoverinfo='label+percent+name')
fig.update_layout(width=800, height=800)
fig.show()
# Convert the price text (e.g. "$65.00") into an integer amount.
# regex=False is explicit because "$" is a regex anchor and the default value
# of `regex` differs between pandas versions; both replacements are meant as
# literal character removals.
price_text = df_listings_detailed['price'].str.replace('$', '', regex=False)
price_text = price_text.str.replace(',', '', regex=False)
# Go through float first so strings like "65.00" parse; the int cast then
# drops the fractional part.
df_listings_detailed['price_in_number'] = price_text.astype(float).astype(int)
# Violin plot of the numeric price for each room type. Only outlier points are
# drawn, and the original price text is added to the hover tooltip.
fig = px.violin(
    df_listings_detailed,
    x="room_type",
    y="price_in_number",
    color="room_type",
    box=False,
    points='outliers',
    hover_data=['price'],
    labels={
        'room_type': 'Room Type',
        'price_in_number': 'Price',
    },
    title='Price of different types of rooms',
)
fig.show()
In the dateset, 83 types of accommodations are listed. The largest number of accommodations are common types such as unit, room in unit, residential home. However, there are small numbers of unusual accommodation types e.g. earth house, tent.
Room types are rather limited. There are 4 types of rooms listed. Majority of listings are made up of entire home/apt and private room.
<ipython-input-89-ca70f7a61bb2>:42: FutureWarning: The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.
# Count the distinct listing ids within each cleansed neighbourhood.
listing_ids_by_area = df_listings_detailed.groupby('neighbourhood_cleansed')['id']
df_listings_number_neighbourhood = listing_ids_by_area.nunique().reset_index()
df_listings_number_neighbourhood.columns = ['neighbourhood', 'property_number']

# One bar per neighbourhood, drawn on a log-scaled y axis.
fig = px.bar(
    df_listings_number_neighbourhood,
    x='neighbourhood',
    y='property_number',
    color='neighbourhood',
    hover_data=['neighbourhood', 'property_number'],
    labels={
        'neighbourhood': 'Neighbourhood',
        'property_number': 'Number of Properties',
    },
    title='Number of properties in neighbourhoods',
    log_y=True,
    height=400,
)
fig.show()
# Violin plot of price per neighbourhood on a log-scaled y axis. Rows are
# sorted by neighbourhood name before plotting.
listings_sorted_by_area = df_listings_detailed.sort_values(by=['neighbourhood_cleansed'])
fig = px.violin(
    listings_sorted_by_area,
    x="neighbourhood_cleansed",
    y="price_in_number",
    color="neighbourhood_cleansed",
    box=False,
    points='outliers',
    hover_data=['price'],
    labels={
        'neighbourhood_cleansed': 'Neighbourhood',
        'price_in_number': 'Price',
    },
    title='Prices in Neighbourhoods',
    log_y=True,
    height=400,
)
fig.show()
# Earliest and latest host registration dates in the dataset.
print('Earlist host:', df_listings_detailed['host_since'].min())
print('Latest host:', df_listings_detailed['host_since'].max())

# Year in which each listing's host registered (the column name is kept as-is
# even though it holds a year, not a full date).
df_listings_detailed['host_since_date'] = df_listings_detailed['host_since'].dt.year

# The frame has one row per listing, so a host with several listings appears
# several times. Keep one row per host so the chart really shows the number
# of hosts, as its axis label says, rather than the number of listings.
unique_hosts = df_listings_detailed.drop_duplicates(subset='host_id')
df_enter_business_date = unique_hosts['host_since_date'].value_counts().reset_index()
df_enter_business_date.columns = ['enter_business_date', 'number']
df_enter_business_date = df_enter_business_date.sort_values(by=['enter_business_date'])

# Bar chart of how many hosts registered in each year.
fig = px.bar(
    df_enter_business_date,
    x='enter_business_date',
    y='number',
    labels={
        'enter_business_date': 'Enter Business Date',
        'number': 'Number of Hosts',
    },
    title="Hosts Enter Buisiness Date",
)
fig.show()
Earlist host 2009-03-20 00:00:00 Latest host 2021-11-04 00:00:00